In [145]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
from sklearn import ensemble

import pytz
import itertools
import visualize
import utils
import pydotplus
import xgboost as xgb

from sklearn import metrics
from sklearn import model_selection

import pvlib
import pv_clf

# NOTE(review): this rebinds the name `visualize`, shadowing the plain
# `import visualize` earlier in this cell; every later `visualize.Visualizer()`
# call therefore resolves to `visualize_plotly`.
import visualize_plotly as visualize
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Initialise plotly's offline notebook rendering for this session.
init_notebook_mode(connected=True)

from IPython.display import Image

# Automatically reload imported modules so edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Print numpy arrays with four digits of precision.
np.set_printoptions(precision=4)
# Select matplotlib's `notebook` backend for figures.
%matplotlib notebook
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Ground predictions

Ground predictions are made using only the PVLib clear-sky model and the statistical model. The NSRDB model will not be available for ground measurements.

Read the data and split it into training and testing sets

In [178]:
# Load the pickled NSRDB frame and re-express its timestamp index in the
# 'MST' zone, in one chained expression.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
nsrdb = pd.read_pickle('abq_nsrdb_1.pkl.gz').tz_convert('MST')
In [179]:
# Hold out every row stamped on or after the split date for testing and fit
# on all earlier rows.
split_date = '01-01-2014'
in_test_period = nsrdb.index >= split_date
in_train_period = nsrdb.index < split_date
test = nsrdb[in_test_period]
train = nsrdb[in_train_period]
In [180]:
# Build the project's random-forest sky classifier with all default settings.
# NOTE(review): the repr recorded after fitting shows random_state=None, so
# results will presumably vary from run to run — consider fixing a seed.
clf = pv_clf.RandomForestClassifierPV()
In [181]:
# Feature matrix layout: one row per sample with columns
# (timestamp, measured GHI, pvlib clear-sky GHI); labels come from `sky_status`.
# NOTE(review): `.index.values` on a tz-aware index presumably drops the
# timezone — confirm pv_clf handles the timestamps as intended.
feature_cols = ['GHI', 'Clearsky GHI pvlib']
label_col = 'sky_status'

X_train = np.asarray([train.index.values] + [train[col].values for col in feature_cols]).T
X_test = np.asarray([test.index.values] + [test[col].values for col in feature_cols]).T
y_train = train[label_col].values
y_test = test[label_col].values
In [182]:
# Fit the classifier on the pre-2014 training rows; as the cell's last
# expression, the call's return value is displayed as the cell output.
clf.fit(X_train, y_train)
Out[182]:
RandomForestClassifierPV(bootstrap=True, by_day=True, class_weight=None,
             criterion='gini', max_depth=None, max_features='auto',
             max_leaf_nodes=None, min_impurity_split=0.0,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_iter=20,
             n_jobs=1, oob_score=False, random_state=None,
             scale_for_fit=True, tol=1e-08, verbose=0, warm_start=False)
In [183]:
# Predict sky status for the held-out rows.
# NOTE(review): the recorded run emitted "Maximum iterations met" from
# pv_clf.py — presumably the alpha iteration stopped at n_iter before reaching
# tol; confirm whether n_iter should be raised.
y_pred = clf.predict(X_test)
/Users/benellis/duramat/clearsky_detection/pv_clf.py:141: RuntimeWarning:

Maximum iterations met (|alpha - alpha_last| = 0.00016072683638412855

In [184]:
# Overlay measured GHI and the alpha-scaled pvlib clear-sky curve as lines,
# then mark the rows selected by the prediction mask.
vis = visualize.Visualizer()
measured_ghi = test['GHI']
scaled_clearsky_ghi = test['Clearsky GHI pvlib'] * clf.alpha_scale
for line_series in (measured_ghi, scaled_clearsky_ghi):
    vis.add_line_ser(line_series)
# y_pred is used as a boolean row mask over the test frame.
vis.add_circle_ser(test.loc[y_pred, 'GHI'])
vis.show()
In [185]:
# Fraction of held-out samples whose predicted label matches the reference label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
Out[185]:
0.9040239726027397
In [191]:
# Compare the class balance of the predictions against the reference labels
# (each array holds the share of samples in each class).
predicted_fractions = np.bincount(y_pred) / len(y_pred)
reference_fractions = np.bincount(y_test) / len(y_test)
predicted_fractions, reference_fractions
Out[191]:
(array([ 0.7683,  0.2317]), array([ 0.7336,  0.2664]))
In [192]:
# Repeat the experiment with an earlier split: rows on or after the split
# date are held out, all earlier rows are used for fitting.
split_date = '01-01-2013'
in_test_period = nsrdb.index >= split_date
in_train_period = nsrdb.index < split_date
test = nsrdb[in_test_period]
train = nsrdb[in_train_period]
In [193]:
# Fresh default-configured classifier for the 2013 split (replaces the model
# fitted on the 2014 split).
# NOTE(review): the repr recorded after fitting shows random_state=None, so
# results will presumably vary from run to run — consider fixing a seed.
clf = pv_clf.RandomForestClassifierPV()
In [194]:
# Feature matrix layout: one row per sample with columns
# (timestamp, measured GHI, pvlib clear-sky GHI); labels come from `sky_status`.
# NOTE(review): `.index.values` on a tz-aware index presumably drops the
# timezone — confirm pv_clf handles the timestamps as intended.
feature_cols = ['GHI', 'Clearsky GHI pvlib']
label_col = 'sky_status'

X_train = np.asarray([train.index.values] + [train[col].values for col in feature_cols]).T
X_test = np.asarray([test.index.values] + [test[col].values for col in feature_cols]).T
y_train = train[label_col].values
y_test = test[label_col].values
In [195]:
# Fit the classifier on the pre-2013 training rows; as the cell's last
# expression, the call's return value is displayed as the cell output.
clf.fit(X_train, y_train)
Out[195]:
RandomForestClassifierPV(bootstrap=True, by_day=True, class_weight=None,
             criterion='gini', max_depth=None, max_features='auto',
             max_leaf_nodes=None, min_impurity_split=0.0,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_iter=20,
             n_jobs=1, oob_score=False, random_state=None,
             scale_for_fit=True, tol=1e-08, verbose=0, warm_start=False)
In [196]:
# Predict sky status for the held-out rows (2013 onward).
y_pred = clf.predict(X_test)
In [197]:
# Overlay measured GHI and the alpha-scaled pvlib clear-sky curve as lines,
# then mark the rows selected by the prediction mask.
vis = visualize.Visualizer()
measured_ghi = test['GHI']
scaled_clearsky_ghi = test['Clearsky GHI pvlib'] * clf.alpha_scale
for line_series in (measured_ghi, scaled_clearsky_ghi):
    vis.add_line_ser(line_series)
# y_pred is used as a boolean row mask over the test frame.
vis.add_circle_ser(test.loc[y_pred, 'GHI'])
vis.show()
In [198]:
# Fraction of held-out samples whose predicted label matches the reference label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
Out[198]:
0.91953957382039575
In [199]:
# Compare the class balance of the predictions against the reference labels
# (each array holds the share of samples in each class).
predicted_fractions = np.bincount(y_pred) / len(y_pred)
reference_fractions = np.bincount(y_test) / len(y_test)
predicted_fractions, reference_fractions
Out[199]:
(array([ 0.743,  0.257]), array([ 0.731,  0.269]))
In [200]:
# Time-ordered cross-validation splitter with 12 splits.
# The class is reached through the already-imported `sklearn.model_selection`
# module: the bare name `TimeSeriesSplit` is never imported in this notebook,
# so the original line raised NameError on a fresh kernel.
tscv = model_selection.TimeSeriesSplit(n_splits=12)
In [201]:
# Number of training rows divided by twelve.
# NOTE(review): TimeSeriesSplit(n_splits=12) presumably sizes its test folds as
# n_samples // (n_splits + 1), so this quotient may not be the actual fold
# size — confirm what was intended here.
X_train.shape[0] / 12
Out[201]:
21916.0
In [206]:
# Cross-validate over the training period: for every split produced by
# `tscv`, fit a fresh classifier on the fitting rows, score it on the
# evaluation rows, and print the predicted class balance for that fold.
scores = []
for fit_idx, eval_idx in tscv.split(X_train):
    # Fold-local names are used on purpose: rebinding the notebook-level
    # `clf` here would silently replace the model fitted on the full training
    # set, which the plotting cell reads through `clf.alpha_scale`.
    fold_clf = pv_clf.RandomForestClassifierPV()
    fold_clf.fit(X_train[fit_idx], y_train[fit_idx])
    fold_pred = fold_clf.predict(X_train[eval_idx])
    scores.append(metrics.accuracy_score(y_train[eval_idx], fold_pred))
    # Share of this fold's evaluation samples assigned to each class.
    print(np.bincount(fold_pred) / len(fold_pred))
/Users/benellis/duramat/clearsky_detection/pv_clf.py:141: RuntimeWarning:

Maximum iterations met (|alpha - alpha_last| = 0.00040186239806594415

[ 0.8041  0.1959]
/Users/benellis/duramat/clearsky_detection/pv_clf.py:141: RuntimeWarning:

Maximum iterations met (|alpha - alpha_last| = 0.0001886299384797896

[ 0.8938  0.1062]
[ 0.9639  0.0361]
/Users/benellis/duramat/clearsky_detection/pv_clf.py:141: RuntimeWarning:

Maximum iterations met (|alpha - alpha_last| = 1.459866887598693e-06

[ 0.8335  0.1665]
[ 0.7781  0.2219]
[ 0.9755  0.0245]
/Users/benellis/duramat/clearsky_detection/pv_clf.py:141: RuntimeWarning:

Maximum iterations met (|alpha - alpha_last| = 0.00011542035269307416

[ 0.8358  0.1642]
/Users/benellis/duramat/clearsky_detection/pv_clf.py:141: RuntimeWarning:

Maximum iterations met (|alpha - alpha_last| = 0.019211587573317535

[ 0.9791  0.0209]
/Users/benellis/duramat/clearsky_detection/pv_clf.py:141: RuntimeWarning:

Maximum iterations met (|alpha - alpha_last| = 0.15011532280285877

[ 0.95  0.05]
/Users/benellis/duramat/clearsky_detection/pv_clf.py:141: RuntimeWarning:

Maximum iterations met (|alpha - alpha_last| = 0.17448502381559228

[ 0.9624  0.0376]
/Users/benellis/duramat/clearsky_detection/pv_clf.py:141: RuntimeWarning:

Maximum iterations met (|alpha - alpha_last| = 0.0002645947731791942

[ 0.8995  0.1005]
[ 0.9105  0.0895]
In [207]:
# Mean and standard deviation of the per-fold accuracy scores.
fold_scores = np.asarray(scores)
fold_scores.mean(), fold_scores.std()
Out[207]:
(0.78516641950897992, 0.061049044141449384)
In [ ]: